import pandas as pd
import numpy as np
import spacy
import tqdm
# Instantiate the English spaCy pipeline.
# NOTE(review): the 'en' shortcut only works on spaCy v2 — it was removed in
# v3, where this should be spacy.load('en_core_web_sm'); confirm the version.
nlp = spacy.load('en')
# Load the labelled data set (columns: Sentence, Sentiment).
setement=pd.read_csv('data.csv')
setement.head()
| Sentence | Sentiment | |
|---|---|---|
| 0 | The GeoSolutions technology will leverage Bene... | positive |
| 1 | $ESI on lows, down $1.50 to $2.50 BK a real po... | negative |
| 2 | For the last quarter of 2010 , Componenta 's n... | positive |
| 3 | According to the Finnish-Russian Chamber of Co... | neutral |
| 4 | The Swedish buyout firm has sold its remaining... | neutral |
We transform the Sentiment labels into integer data 🔢
# Map the string labels to integers: positive -> 1, negative -> -1, neutral -> 0.
# FIX: the original iterated row by row and assigned through chained indexing
# (setement['Sentiment'][index] = ...), which raises pandas'
# SettingWithCopyWarning and may silently write to a copy. `Series.replace`
# does the whole column in one vectorised pass and, like the original `if`
# chain, leaves any unexpected label value unchanged.
setement['Sentiment'] = setement['Sentiment'].replace(
    {'positive': 1, 'negative': -1, 'neutral': 0})
setement.head()
| Sentence | Sentiment | |
|---|---|---|
| 0 | The GeoSolutions technology will leverage Bene... | 1 |
| 1 | $ESI on lows, down $1.50 to $2.50 BK a real po... | -1 |
| 2 | For the last quarter of 2010 , Componenta 's n... | 1 |
| 3 | According to the Finnish-Russian Chamber of Co... | 0 |
| 4 | The Swedish buyout firm has sold its remaining... | 0 |
setement['Sentence'][0]
"The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model ."
We import the tools needed to clean the text.
%%time
# Time a single spaCy parse of one sentence (full pipeline enabled).
nlp(setement['Sentence'][1])
CPU times: user 19.8 ms, sys: 931 µs, total: 20.7 ms Wall time: 29.6 ms
$ESI on lows, down $1.50 to $2.50 BK a real possibility
import nltk
from nltk.corpus import stopwords
# Download the NLTK stop-word list (no-op if already present).
nltk.download("stopwords")
# NOTE(review): stop_words is never used below — the cleaning loop relies on
# spaCy's token.is_stop instead; confirm whether this set is still needed.
stop_words = set(stopwords.words("english"))
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip.
import re
import string
We clean all sentences in our data frame:
%%time
clean_sen = []
pbar = tqdm.tqdm(total=len(setement['Sentence']),position=0, leave=True)
for text in nlp.pipe(setement['Sentence'], disable=["tagger", "parser", "ner"]):
txt = [token.lemma_.lower() for token in text
if token.is_alpha
and not token.is_stop
and not token.is_punct]
clean_sen.append(txt)
pbar.update(1)
99%|█████████▉| 5771/5842 [00:01<00:00, 5114.68it/s]
CPU times: user 1.28 s, sys: 26.3 ms, total: 1.31 s Wall time: 1.33 s
We replace each sentence with its list of cleaned words ✅
# Overwrite each raw sentence with its cleaned token list.
setement['Sentence']=clean_sen
setement.head()
| Sentence | Sentiment | |
|---|---|---|
| 0 | [geosolutions, technology, leverage, benefon, ... | 1 |
| 1 | [esi, low, bk, real, possibility] | -1 |
| 2 | [quarter, componenta, net, sale, double, be, b... | 1 |
| 3 | [according, finnish, russian, chamber, commerc... | 0 |
| 4 | [swedish, buyout, firm, sell, remain, percent,... | 0 |
print(setement['Sentence'].isnull().sum() , 'NO NULL great day' )
0 NO NULL great day
We install the packages we need to transform words into vectors, such as Word2Vec:
!pip install --upgrade gensim -q
|████████████████████████████████| 24.1 MB 54.3 MB/s
import logging
from gensim.models import Word2Vec
# Show gensim's INFO-level training progress in the notebook output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
We transform each word into a vector of 200 elements, keeping only the words that occur at least 10 times ⚔
w2v_model = Word2Vec(sentences=setement['Sentence'], vector_size=200, window=5, min_count=10, workers=2, epochs=10)
2022-03-13 20:30:14,056 : INFO : collecting all words and their counts
2022-03-13 20:30:14,060 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-03-13 20:30:14,085 : INFO : collected 8176 word types from a corpus of 62135 raw words and 5842 sentences
2022-03-13 20:30:14,087 : INFO : Creating a fresh vocabulary
2022-03-13 20:30:14,101 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 1108 unique words (13.551859099804306%% of original 8176, drops 7068)', 'datetime': '2022-03-13T20:30:14.101571', 'gensim': '4.1.2', 'python': '3.7.12 (default, Jan 15 2022, 18:48:18) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.144+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'prepare_vocab'}
2022-03-13 20:30:14,104 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 leaves 46932 word corpus (75.53230868270701%% of original 62135, drops 15203)', 'datetime': '2022-03-13T20:30:14.104061', 'gensim': '4.1.2', 'python': '3.7.12 (default, Jan 15 2022, 18:48:18) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.144+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'prepare_vocab'}
2022-03-13 20:30:14,114 : INFO : deleting the raw counts dictionary of 8176 items
2022-03-13 20:30:14,117 : INFO : sample=0.001 downsamples 62 most-common words
2022-03-13 20:30:14,121 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 39333.126082498784 word corpus (83.8%% of prior 46932)', 'datetime': '2022-03-13T20:30:14.121734', 'gensim': '4.1.2', 'python': '3.7.12 (default, Jan 15 2022, 18:48:18) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.144+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'prepare_vocab'}
2022-03-13 20:30:14,144 : INFO : estimated required memory for 1108 words and 200 dimensions: 2326800 bytes
2022-03-13 20:30:14,146 : INFO : resetting layer weights
2022-03-13 20:30:14,153 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2022-03-13T20:30:14.153479', 'gensim': '4.1.2', 'python': '3.7.12 (default, Jan 15 2022, 18:48:18) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.144+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'build_vocab'}
2022-03-13 20:30:14,156 : INFO : Word2Vec lifecycle event {'msg': 'training model with 2 workers on 1108 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-03-13T20:30:14.156062', 'gensim': '4.1.2', 'python': '3.7.12 (default, Jan 15 2022, 18:48:18) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.144+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'train'}
2022-03-13 20:30:14,232 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-13 20:30:14,233 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-13 20:30:14,242 : INFO : EPOCH - 1 : training on 62135 raw words (39281 effective words) took 0.1s, 537866 effective words/s
2022-03-13 20:30:14,315 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-13 20:30:14,321 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-13 20:30:14,324 : INFO : EPOCH - 2 : training on 62135 raw words (39335 effective words) took 0.1s, 548904 effective words/s
2022-03-13 20:30:14,399 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-13 20:30:14,401 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-13 20:30:14,406 : INFO : EPOCH - 3 : training on 62135 raw words (39257 effective words) took 0.1s, 551728 effective words/s
2022-03-13 20:30:14,489 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-13 20:30:14,493 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-13 20:30:14,494 : INFO : EPOCH - 4 : training on 62135 raw words (39300 effective words) took 0.1s, 506079 effective words/s
2022-03-13 20:30:14,572 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-13 20:30:14,578 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-13 20:30:14,580 : INFO : EPOCH - 5 : training on 62135 raw words (39380 effective words) took 0.1s, 561959 effective words/s
2022-03-13 20:30:14,655 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-13 20:30:14,658 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-13 20:30:14,659 : INFO : EPOCH - 6 : training on 62135 raw words (39346 effective words) took 0.1s, 578077 effective words/s
2022-03-13 20:30:14,736 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-13 20:30:14,741 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-13 20:30:14,742 : INFO : EPOCH - 7 : training on 62135 raw words (39422 effective words) took 0.1s, 567588 effective words/s
2022-03-13 20:30:14,824 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-13 20:30:14,827 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-13 20:30:14,829 : INFO : EPOCH - 8 : training on 62135 raw words (39358 effective words) took 0.1s, 549884 effective words/s
2022-03-13 20:30:14,914 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-13 20:30:14,920 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-13 20:30:14,923 : INFO : EPOCH - 9 : training on 62135 raw words (39384 effective words) took 0.1s, 483433 effective words/s
2022-03-13 20:30:14,998 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-13 20:30:15,001 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-13 20:30:15,002 : INFO : EPOCH - 10 : training on 62135 raw words (39354 effective words) took 0.1s, 569837 effective words/s
2022-03-13 20:30:15,004 : INFO : Word2Vec lifecycle event {'msg': 'training on 621350 raw words (393417 effective words) took 0.8s, 465480 effective words/s', 'datetime': '2022-03-13T20:30:15.004254', 'gensim': '4.1.2', 'python': '3.7.12 (default, Jan 15 2022, 18:48:18) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.144+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'train'}
2022-03-13 20:30:15,006 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec(vocab=1108, vector_size=200, alpha=0.025)', 'datetime': '2022-03-13T20:30:15.005879', 'gensim': '4.1.2', 'python': '3.7.12 (default, Jan 15 2022, 18:48:18) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.144+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}
w2v_model.wv.similar_by_word('finance') #similarty to finance
[('partnership', 0.9983425140380859),
('way', 0.9983341693878174),
('know', 0.9982154369354248),
('recycle', 0.9979555010795593),
('germany', 0.9977225065231323),
('responsible', 0.997700572013855),
('buyout', 0.99757319688797),
('advance', 0.9974996447563171),
('upgrade', 0.9974631667137146),
('competition', 0.997349739074707)]
w2v_model.wv.similar_by_word('good') #similartiy to good
[('near', 0.9981837272644043),
('like', 0.9975825548171997),
('look', 0.9969451427459717),
('signal', 0.9968292117118835),
('actual', 0.996150016784668),
('clear', 0.9958698749542236),
('think', 0.9958350658416748),
('get', 0.995799720287323),
('setup', 0.995729923248291),
('car', 0.9954541921615601)]
w2v_model.wv.similar_by_word('low') #similartiy to good
[('pct', 0.9471407532691956),
('turnover', 0.9287186861038208),
('bln', 0.9285021424293518),
('october', 0.9227984547615051),
('end', 0.9222664833068848),
('percent', 0.9185442924499512),
('see', 0.9164886474609375),
('warn', 0.9158691763877869),
('fiscal', 0.9089294075965881),
('trade', 0.9077963829040527)]
We take the average of the word vectors to build a single vector for each sentence in our data frame, using this function from the teacher's code ☝
def get_mean_vector(w2v_model, words):
    """Average the model's vectors for `words` into one sentence vector.

    Words missing from the model vocabulary are ignored. Returns a 1-D
    numpy array, or an empty list when no word is in the vocabulary.
    """
    known = [w for w in words if w in w2v_model.wv.key_to_index]
    if not known:
        return []
    return np.mean(w2v_model.wv[known], axis=0)
w2v_model.save('w2v_model')
2022-03-13 20:30:15,104 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'w2v_model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-03-13T20:30:15.098881', 'gensim': '4.1.2', 'python': '3.7.12 (default, Jan 15 2022, 18:48:18) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.144+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'saving'}
2022-03-13 20:30:15,110 : INFO : not storing attribute cum_table
2022-03-13 20:30:15,147 : INFO : saved w2v_model
# One averaged 200-dim vector per sentence (an empty list where no word was in vocabulary).
avg_vecs =setement['Sentence'].map(lambda t: get_mean_vector(w2v_model, t))
avg_vecs
0 [-0.023638105, -0.2225693, 0.05494887, -0.0885...
1 [-0.049470887, -0.18776272, -0.13305257, 0.161...
2 [0.11238689, 0.04761131, -0.22016947, 0.495715...
3 [-0.010964988, -0.17172924, 0.012902707, 0.039...
4 [0.012547172, -0.10590384, -0.07147517, 0.1087...
...
5837 [0.0020562652, -0.12607355, -0.009878085, -0.0...
5838 [-0.0032976754, -0.12049678, -0.025588552, 0.0...
5839 [0.0036183472, -0.10471592, -0.022085711, 0.03...
5840 [0.07394402, 0.033335775, -0.16605195, 0.35318...
5841 [-0.036986586, -0.21293043, -0.016931655, 0.09...
Name: Sentence, Length: 5842, dtype: object
len(avg_vecs)
5842
Aaaaah, there is a problem: some vectors are empty,
so we delete them — easy ❎
len(avg_vecs[29])==0
True
# Collect the indices of sentences whose mean vector is empty (every token
# was out of vocabulary). BUG FIX: the original looped over range(0, 5841),
# which both hard-codes the length and skips the final row (index 5841);
# iterating over the actual length covers every row.
to_del = [i for i in range(len(avg_vecs)) if len(avg_vecs[i]) == 0]
to_del
[29, 175, 292, 318, 497, 921, 1081, 1190, 1191, 1487, 1637, 1794, 1991, 2102, 2115, 2145, 2474, 2912, 2987, 3373, 3630, 3954, 4005, 4374, 4720, 4875, 4955, 5123, 5345, 5521, 5545, 5693, 5746, 5811]
# Drop all empty-vector rows in one pass. The original deleted one label at
# a time inside a loop (`del avg_vecs[i]`), which rebuilds the Series on
# every deletion; Series.drop removes all of them at once.
avg_vecs = avg_vecs.drop(to_del)
We transform our vectors into a matrix:
# Stack the per-sentence vectors into an (n_sentences, 200) matrix.
avg_vecs = np.vstack(avg_vecs)
avg_vecs.shape
(5808, 200)
from sklearn.metrics.pairwise import cosine_similarity
We verify similarity with row five 🐾
# Cosine similarity of sentence vector 4 against every sentence vector.
sims = cosine_similarity(avg_vecs[4].reshape(1, -1), avg_vecs)
# Indices of the ten most similar sentences, best first.
ix = np.argsort(sims[0])[::-1][:10].tolist()
setement['Sentence'][ix]
4 [swedish, buyout, firm, sell, remain, percent,... 126 [tyc, break, new, high, lot, technicals, look,... 2509 [crh, concrete, bid, holcim, lafarge, asset] 5505 [incap, contract, manufacturing, carry, manufa... 4237 [financial, report, publish, friday, seb, say,... 5685 [time, finnair, award, agreement, uk, government] 3011 [upset, machinery, work, take, poland] 3660 [renesas, mobile, europe, ltd, approximately, ... 1427 [result, taxis, loss, million, euro] 570 [operating, profit, decrease, eur, mn, eur, mn] Name: Sentence, dtype: object
# Embed a free-text query with the same cleaning pipeline, then rank all
# sentences by cosine similarity to it.
query = 'good year'
parsed = nlp(query, disable=["tagger", "parser", "ner"])
cleaned = [tok.lemma_.lower() for tok in parsed
           if tok.is_alpha and not tok.is_stop and not tok.is_punct]
query = get_mean_vector(w2v_model, cleaned)
sims = cosine_similarity(query.reshape(1, -1), avg_vecs)
# Ten most similar sentence indices, best first.
ix = np.argsort(sims[0])[::-1][:10].tolist()
setement['Sentence'][ix]
2410 [competition, authority, approve, deal, finalize] 1469 [skf, april, alandsbanken, give, buy, recommen... 2995 [delhaize, le, lion, select, aldata, vocal, pd... 4024 [company, see, net, profit, quarter, be, be, c... 1132 [public, service, available] 4060 [previously, order, set, start, produce, elect... 1342 [company, say, measure, long, need] 4884 [nokia, music, store, begin, trade, tuesday, s... 1648 [aldata, say, numb, operational, aspect, defin... 16 [msft, sql, server, revenue, grow, double, dig... Name: Sentence, dtype: object
In this section we try to cluster the data and see whether there are clusters of positive, negative, and neutral sentences or not. I think not — this kind of task needs to be supervised — but let's try: ▶
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
inertias = []  # within-cluster sum of squares (WSS) for each k
ks = range(1, 10)
# Fit one k-means++ model per candidate k and record its inertia.
# CONSISTENCY FIX: `ks` was defined but the original repeated the
# range(1, 10) literal both here and in the plot call below.
for k in ks:
    model = KMeans(n_clusters=k, init='k-means++')
    # Fit model to samples
    model.fit(avg_vecs)
    # Append the inertia to the list of inertias
    inertias.append(model.inertia_)
plt.plot(ks, inertias, '-p', color='black')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.plot(3, inertias[2], 'ro')  # highlight the elbow at k=3
plt.xticks(ks)
plt.title('Elbow for K-mean++')
plt.show()
Yes, that was expected: three clusters ✅
# Reload the raw sentences and drop the rows whose vectors were empty.
# BUG FIX: DataFrame.drop returns a new frame — the original discarded the
# result, so data_set kept all 5842 rows and the later index-based merge
# paired vectors with the wrong sentences. Assign the result back and reset
# the index so row i of data_set lines up with row i of avg_vecs.
data_set = pd.read_csv('data.csv')
data_set = data_set.drop(to_del).reset_index(drop=True)
| Sentence | Sentiment | |
|---|---|---|
| 0 | The GeoSolutions technology will leverage Bene... | positive |
| 1 | $ESI on lows, down $1.50 to $2.50 BK a real po... | negative |
| 2 | For the last quarter of 2010 , Componenta 's n... | positive |
| 3 | According to the Finnish-Russian Chamber of Co... | neutral |
| 4 | The Swedish buyout firm has sold its remaining... | neutral |
| ... | ... | ... |
| 5837 | RISING costs have forced packaging producer Hu... | negative |
| 5838 | Nordic Walking was first used as a summer trai... | neutral |
| 5839 | According shipping company Viking Line , the E... | neutral |
| 5840 | In the building and home improvement trade , s... | neutral |
| 5841 | HELSINKI AFX - KCI Konecranes said it has won ... | positive |
5808 rows × 2 columns
data=pd.DataFrame(avg_vecs)
We put the average vectors in the same rows as their sentences:
# Join sentences/labels with their average vectors, aligning on row index.
# NOTE(review): this only pairs rows correctly if data_set was re-indexed
# after dropping the empty-vector rows — verify the drop step above.
df=pd.merge(data_set, data, left_index=True, right_index=True) ## put each average vector in the same row as its sentence
df
| Sentence | Sentiment | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | ... | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | The GeoSolutions technology will leverage Bene... | positive | -0.007924 | -0.223087 | 0.053513 | -0.092287 | 0.186106 | -0.143922 | -0.302044 | 0.109837 | ... | -0.033534 | -0.125585 | 0.200090 | -0.013439 | 0.066113 | 0.089940 | 0.098243 | -0.173657 | -0.023096 | -0.217823 |
| 1 | $ESI on lows, down $1.50 to $2.50 BK a real po... | negative | -0.033407 | -0.185608 | -0.139528 | 0.187546 | 0.125491 | -0.186068 | 0.108543 | 0.425107 | ... | 0.044660 | -0.114231 | 0.106686 | -0.050289 | 0.072860 | -0.021438 | 0.026665 | -0.150976 | 0.046502 | -0.203028 |
| 2 | For the last quarter of 2010 , Componenta 's n... | positive | 0.125046 | 0.056649 | -0.220464 | 0.496381 | 0.126217 | -0.286173 | 0.592212 | 0.714471 | ... | 0.326551 | -0.209325 | -0.442689 | -0.256762 | 0.310491 | 0.020907 | 0.157587 | -0.183014 | -0.015510 | -0.124678 |
| 3 | According to the Finnish-Russian Chamber of Co... | neutral | 0.003853 | -0.171919 | 0.011540 | 0.036366 | 0.172062 | -0.160615 | -0.117759 | 0.222415 | ... | 0.044890 | -0.146413 | 0.079376 | -0.080953 | 0.115625 | 0.044681 | 0.110950 | -0.165525 | -0.010940 | -0.183199 |
| 4 | The Swedish buyout firm has sold its remaining... | neutral | 0.024353 | -0.102986 | -0.071792 | 0.106625 | 0.153534 | -0.182802 | 0.021705 | 0.330730 | ... | 0.084876 | -0.143799 | -0.000739 | -0.076875 | 0.111007 | 0.051909 | 0.072806 | -0.160228 | -0.009412 | -0.167364 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5803 | Clothing chain Sepp+ñl+ñ 's net sales increase... | positive | 0.014964 | -0.123877 | -0.011971 | -0.002653 | 0.151584 | -0.148771 | -0.136416 | 0.191442 | ... | 0.037576 | -0.106276 | 0.076302 | -0.042566 | 0.076620 | 0.080352 | 0.100525 | -0.170094 | -0.012681 | -0.175547 |
| 5804 | The administrators have indicated a need for 9... | negative | 0.009051 | -0.120605 | -0.027529 | 0.019491 | 0.144585 | -0.134330 | -0.100360 | 0.208948 | ... | 0.049316 | -0.107199 | 0.074076 | -0.034060 | 0.070594 | 0.071173 | 0.084963 | -0.170542 | -0.005605 | -0.166664 |
| 5805 | The new office , located in Shenzhen , will st... | positive | 0.015072 | -0.102433 | -0.023300 | 0.035408 | 0.145768 | -0.153834 | -0.076065 | 0.228040 | ... | 0.059994 | -0.120730 | 0.038342 | -0.057840 | 0.096389 | 0.069993 | 0.097647 | -0.167855 | -0.012384 | -0.168423 |
| 5806 | However , the total orders received will still... | positive | 0.091973 | 0.042430 | -0.168200 | 0.359545 | 0.153028 | -0.240070 | 0.363694 | 0.629277 | ... | 0.313023 | -0.168646 | -0.320361 | -0.212850 | 0.263384 | 0.111964 | 0.193941 | -0.328517 | -0.024283 | -0.208992 |
| 5807 | Rivals say Qualcomm has fewer patents on 3G ph... | neutral | -0.024842 | -0.219929 | -0.014696 | 0.088125 | 0.133905 | -0.148023 | -0.015218 | 0.232311 | ... | 0.003410 | -0.122417 | 0.091498 | -0.068555 | 0.111113 | -0.039087 | 0.037718 | -0.075832 | 0.005582 | -0.158783 |
5808 rows × 202 columns
# Feature matrix: just the 200 embedding columns (drop text and label).
X=df.drop(columns=['Sentence' , 'Sentiment'])
X
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.007924 | -0.223087 | 0.053513 | -0.092287 | 0.186106 | -0.143922 | -0.302044 | 0.109837 | 0.044898 | 0.341305 | ... | -0.033534 | -0.125585 | 0.200090 | -0.013439 | 0.066113 | 0.089940 | 0.098243 | -0.173657 | -0.023096 | -0.217823 |
| 1 | -0.033407 | -0.185608 | -0.139528 | 0.187546 | 0.125491 | -0.186068 | 0.108543 | 0.425107 | -0.084094 | 0.152714 | ... | 0.044660 | -0.114231 | 0.106686 | -0.050289 | 0.072860 | -0.021438 | 0.026665 | -0.150976 | 0.046502 | -0.203028 |
| 2 | 0.125046 | 0.056649 | -0.220464 | 0.496381 | 0.126217 | -0.286173 | 0.592212 | 0.714471 | -0.383984 | -0.069170 | ... | 0.326551 | -0.209325 | -0.442689 | -0.256762 | 0.310491 | 0.020907 | 0.157587 | -0.183014 | -0.015510 | -0.124678 |
| 3 | 0.003853 | -0.171919 | 0.011540 | 0.036366 | 0.172062 | -0.160615 | -0.117759 | 0.222415 | -0.040032 | 0.246415 | ... | 0.044890 | -0.146413 | 0.079376 | -0.080953 | 0.115625 | 0.044681 | 0.110950 | -0.165525 | -0.010940 | -0.183199 |
| 4 | 0.024353 | -0.102986 | -0.071792 | 0.106625 | 0.153534 | -0.182802 | 0.021705 | 0.330730 | -0.078210 | 0.163306 | ... | 0.084876 | -0.143799 | -0.000739 | -0.076875 | 0.111007 | 0.051909 | 0.072806 | -0.160228 | -0.009412 | -0.167364 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5803 | 0.014964 | -0.123877 | -0.011971 | -0.002653 | 0.151584 | -0.148771 | -0.136416 | 0.191442 | -0.020190 | 0.225283 | ... | 0.037576 | -0.106276 | 0.076302 | -0.042566 | 0.076620 | 0.080352 | 0.100525 | -0.170094 | -0.012681 | -0.175547 |
| 5804 | 0.009051 | -0.120605 | -0.027529 | 0.019491 | 0.144585 | -0.134330 | -0.100360 | 0.208948 | -0.025042 | 0.204545 | ... | 0.049316 | -0.107199 | 0.074076 | -0.034060 | 0.070594 | 0.071173 | 0.084963 | -0.170542 | -0.005605 | -0.166664 |
| 5805 | 0.015072 | -0.102433 | -0.023300 | 0.035408 | 0.145768 | -0.153834 | -0.076065 | 0.228040 | -0.045848 | 0.189398 | ... | 0.059994 | -0.120730 | 0.038342 | -0.057840 | 0.096389 | 0.069993 | 0.097647 | -0.167855 | -0.012384 | -0.168423 |
| 5806 | 0.091973 | 0.042430 | -0.168200 | 0.359545 | 0.153028 | -0.240070 | 0.363694 | 0.629277 | -0.296727 | 0.043329 | ... | 0.313023 | -0.168646 | -0.320361 | -0.212850 | 0.263384 | 0.111964 | 0.193941 | -0.328517 | -0.024283 | -0.208992 |
| 5807 | -0.024842 | -0.219929 | -0.014696 | 0.088125 | 0.133905 | -0.148023 | -0.015218 | 0.232311 | -0.067261 | 0.193201 | ... | 0.003410 | -0.122417 | 0.091498 | -0.068555 | 0.111113 | -0.039087 | 0.037718 | -0.075832 | 0.005582 | -0.158783 |
5808 rows × 200 columns
# Final k-means++ clustering with the elbow-chosen k=3.
model = KMeans(n_clusters=3, init='k-means++', n_init=5)
model.fit(X)
# coordinates of cluster centers
centroids = model.cluster_centers_
# cluster label for each data point
labels = model.labels_
# Project the points onto embedding dimensions 0 and 6.
plt.scatter(X[0], X[6], c=labels)
# BUG FIX: the points are plotted in dimensions (0, 6), but the original
# drew the centroid stars in dimensions (0, 1), so they landed in the wrong
# place on the plot; use the same pair of dimensions for both.
plt.scatter(centroids[:, 0], centroids[:, 6], s=300, marker='*', c='red', edgecolor='black')
plt.title('K-mean++ visualization')
plt.show()
# Attach each point's cluster id as a new 'labels' column, aligned on index.
labels = pd.DataFrame(labels, columns=['labels'])
df = pd.merge(df, labels, left_index=True, right_index=True)
!pip install altair
Requirement already satisfied: altair in /usr/local/lib/python3.7/dist-packages (4.2.0) Requirement already satisfied: toolz in /usr/local/lib/python3.7/dist-packages (from altair) (0.11.2) Requirement already satisfied: pandas>=0.18 in /usr/local/lib/python3.7/dist-packages (from altair) (1.3.5) Requirement already satisfied: jinja2 in /usr/local/lib/python3.7/dist-packages (from altair) (2.11.3) Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from altair) (1.21.5) Requirement already satisfied: jsonschema>=3.0 in /usr/local/lib/python3.7/dist-packages (from altair) (4.3.3) Requirement already satisfied: entrypoints in /usr/local/lib/python3.7/dist-packages (from altair) (0.4) Requirement already satisfied: importlib-resources>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema>=3.0->altair) (5.4.0) Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from jsonschema>=3.0->altair) (3.10.0.2) Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from jsonschema>=3.0->altair) (4.11.2) Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema>=3.0->altair) (0.18.1) Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema>=3.0->altair) (21.4.0) Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from importlib-resources>=1.4.0->jsonschema>=3.0->altair) (3.7.0) Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.18->altair) (2018.9) Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.18->altair) (2.8.2) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=0.18->altair) (1.15.0) Requirement already satisfied: 
MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2->altair) (2.0.1)
import altair as alt
df.columns = df.columns.map(str)  # Altair requires string column names
df1=df[:3500] ### Altair has problems with more than 5000 rows
# Scatter of two embedding dimensions, coloured by cluster label, with the
# original sentence and sentiment shown on hover.
chart=alt.Chart(df1).mark_point().encode(
x='12', ## embedding dimension 12 on the x-axis
y='1', ## embedding dimension 1 on the y-axis
color='labels',
tooltip=['Sentence', 'Sentiment']
)
chart